# -*-encoding:utf-8-*-
import re
import requests
import building
from bs4 import BeautifulSoup
import csv

host='http://newhouse.njhouse.com.cn'

# Create the output CSV and write the header row (Python 2: the file is
# opened in binary mode so csv.writer does not emit an extra \r per row,
# and a UTF-8 BOM is prepended so Excel detects the encoding).
titles=[u'开盘日期',u'项目名称',u'项目推广名',u'项目类别',u'项目地址',u'联系电话']
titles+=[u'项目链接',u'区属',u'物业',u'简介',u'纳入网上销售总套数',u'可售总套数',u'认购套数',u'成交套数',u'今日认购套数',u'今日成交套数',u'均价',u'住宅类均价',u'住宅类当月均价',u'楼盘换手率']
titles+=[u'预售方案链接',u'预售方案编号',u'预售方案申报日期',u'期数',u'幢号',u'用途',u'销售面积(㎡)',u'套数(间、个)',u'装修类型',u'拟交付时间',u'拟销售均价(元/㎡,车位：万元/个)']
# One open in 'wb' is enough: write the BOM, then the header row, in the
# same handle (the original opened the file twice for no benefit).
with open('some.csv', 'wb') as f:
	f.write(u'\ufeff'.encode('utf8'))
	writer = csv.writer(f)
	writer.writerow([title.encode('utf8') for title in titles])

def GetBuilding(tr):
	"""Scrape one listing <tr> from the opening-announcement page and append
	one or more rows to some.csv.

	tr -- a BeautifulSoup <tr> whose cells are: opening date, project name
	(with a link to the detail page), promotional name, category, address,
	and phone.  Follows the detail link, two nested iframes (sales counts /
	prices and the pre-sale permit), and the pre-sale plan page.

	NOTE(review): all table/iframe indices below (tables[19], tables[28],
	tables[32], iframes [2]/[3], fixed row/cell offsets) are hard-coded to
	the live page layout at the time of writing — any site redesign breaks
	them silently.  Python 2 only (print statement, dict.has_key).
	"""
	tds=tr.find_all('td')
	building={}
	building['open_time']=tds[0].get_text().strip()# opening date
	building['name']=tds[1].get_text().strip()# project name
	print building['name']
	building['advertisement']=tds[2].get_text().strip()# promotional name
	building['project_type']=tds[3].get_text().strip()# project category
	building['address']=tds[4].get_text().strip()# project address
	building['phone']=tds[5].get_text().strip()# contact phone
	# Detail links are relative ('..'); rewrite them against the site host.
	building['url']=tds[1].find('a').get('href').replace('..',host)# project detail URL
	r=requests.get(building['url'])
	# Site serves GBK; transcode to UTF-8 bytes before parsing.
	html=r.content.decode('gbk').encode('utf-8')

	soup=BeautifulSoup(html,'html.parser')	
	tables=soup.find_all('table')
	'''for idx,table in enumerate(tables):
		if len(table.find_all('table'))==0:
			with open('detail'+str(idx)+'.html','w') as fw:
				fw.write(str(table))'''
	building['owned']=tables[19].find_all('tr')[2].find_all('td')[1].get_text()# district
	building['property']=tables[28].find_all('tr')[0].find_all('td')[1].get_text()# property-management company
	building['introduction']=tables[32].find_all('tr')[0].find_all('td')[0].get_text()# project introduction
	
	# Average price and unit counts live in an iframe that itself embeds
	# another iframe — only the inner one has the real URL.
	src1=soup.find_all('iframe')[3].get('src')
	src2=soup.find_all('iframe')[2].get('src')# pre-sale permit iframe
	with open('href.html','a') as fw:
		fw.writelines(src1)
		fw.writelines(src2)

	r=requests.get(src1)
	src=BeautifulSoup(r.content,'html.parser').iframe.get('src')
	# The inner iframe src is relative; splice its tail onto src1's base
	# around the shared '/include/fdc_include' path segment.
	src=src.split('/include/fdc_include',2)[1]
	src=src1.split('/include/fdc_include',2)[0]+'/include/fdc_include'+src
	r=requests.get(src)
	soup=BeautifulSoup(r.content,'html.parser')
	building['count_all']=soup.body.table.tr.find_all('td')[1].get_text().strip()# total units listed for online sale
	t=soup.body.find_all('table')[1]
	building['count_salable']=t.tr.find_all('td')[1].get_text().strip()# total salable units
	building['count_subscription']=t.find_all('tr')[1].find_all('td')[3].get_text().strip()# subscribed units
	building['count_traded']=t.find_all('tr')[2].find_all('td')[1].get_text().strip()# sold units
	building['count_subscription_today']=t.find_all('tr')[5].find_all('td')[1].get_text().strip()# units subscribed today
	building['count_traded_today']=t.find_all('tr')[5].find_all('td')[3].get_text().strip()# units sold today
	building['price_average']=t.find_all('tr')[3].find_all('td')[1].get_text().strip()# average price
	building['price_average_home']=t.find_all('tr')[3].find_all('td')[3].get_text().strip()# residential average price
	building['price_average_home_month']=t.find_all('tr')[6].find_all('td')[1].get_text().strip()# residential average price this month
	building['rate_turnover']=t.find_all('tr')[7].find_all('td')[3].get_text().strip()# turnover rate

	r=requests.get(src2)
	soup=BeautifulSoup(r.content,'html.parser')
	div_authorization=soup.find_all('div',class_="scroll")[0]
	#print div_authorization.prettify()
	# Regex keeps only absolute http:// links inside the permit list.
	list_a=div_authorization.find_all("a", href=re.compile("http://(.*)"))
	src=list_a[0].get('href')# link to the Nanjing commodity-housing pre-sale plan
	
	# Dump the raw page for offline inspection, then append a local jQuery
	# tag so the saved copy renders correctly when opened in a browser.
	r=requests.get(src)
	with open('detail.html','w') as fw:
		fw.write(r.content)
	with open('detail.html','a') as fw:
		fw.write('<script src="jquery-1.10.2.min.js"></script>')

	html=r.content.decode('gbk').encode('utf-8')
	soup=BeautifulSoup(html,'html.parser')
	

	# Assemble the CSV row in the same order as the header written at module load.
	row=[building['open_time'],building['name'],building['advertisement'],building['project_type'],building['address'],building['phone'],building['url']]
	row+=[building['owned'],building['property'],building['introduction']]
	row+=[building['count_all'],building['count_salable'],building['count_subscription'],building['count_traded'],building['count_subscription_today'],building['count_traded_today'],building['price_average'],building['price_average_home'],building['price_average_home_month'],building['rate_turnover']]
	

	# Pre-sale plan details are optional — some projects have no plan page.
	spans=soup.find_all('span',class_="bodyMsg")
	if len(spans)==0:
		pass
	else:
		building['presale_url']=src# pre-sale plan URL
		building['presale_id']=spans[0].get_text()# pre-sale plan number
		building['presale_time']=spans[len(spans)-1].get_text()# pre-sale plan filing date
		table=soup.find_all('table',class_="Demo")[1]# "basic details of units applied for pre-sale" table
		list_detail=[]
		for tr in table.tbody.find_all('tr'):
			detail=[]
			for td in tr.find_all('td'):
				#print td.get_text().encode('utf8', 'ignore')
				detail.append(td.get_text())
			list_detail.append(detail)
		building['presales_detail']=list_detail
		row+=[building['presale_url'],building['presale_id'],building['presale_time']]

	# One CSV row per pre-sale detail row (the shared columns repeat),
	# or a single row when the project has no pre-sale plan.
	if building.has_key('presales_detail'):
		for detail in building['presales_detail']:
			list_detail=row+detail
			with open('some.csv', 'ab+') as f:
				writer = csv.writer(f)
				writer.writerow([title.encode('utf8') for title in list_detail])
	else:		
		with open('some.csv', 'ab+') as f:
			writer = csv.writer(f)
			writer.writerow([title.encode('utf8') for title in row])


# Entry point: fetch the opening-announcement index page and scrape every
# residential listing row.
url=host+'/kpgg/'
r=requests.get(url, stream=True)
# Site serves GBK; transcode to UTF-8 bytes before parsing.
html=r.content.decode('gbk').encode('utf-8')

# BUG FIX: BeautifulSoup is imported directly from bs4 (see the imports at
# the top of the file); the previous code called it through the unrelated
# `building` module (`building.BeautifulSoup`), which only worked if that
# module happened to re-export the name.
soup = BeautifulSoup(html, 'html.parser')
tables = soup.body.find_all('table')

areas=tables[2]      # table holding the district list (unused below)
building1=tables[4]  # first table of residential listings
building2=tables[5]  # second table of residential listings
trs = building1.find_all('tr') + building2.find_all('tr')
del trs[0]           # drop the header row of the first table
for tr in trs:
	GetBuilding(tr)

